import os

import pandas as pd

# Neo4j 结点文件生成：
#
# 电影结点（电影ID，电影名称，标签）
#
# 导演结点（导演ID，导演名称，标签）
#
# 演员结点（演员ID，演员名称，标签）
#
# 类型结点（类型ID，类型名称，标签）


# # Load the raw Douban data and pick out the columns used to build the
# # film, director, actor and type collections.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0 (passing it there raises TypeError).
# As before, malformed rows are dropped and a warning is emitted for them.
df = pd.read_csv('Douban.csv', on_bad_lines='warn')
df_film = df['filmname']
df_directors = df['directors']
df_actors = df['actors']
df_types = df['type']

# Accumulators for the generated node IDs; they are filled further down.
filmID, directorID, actorID, typeID = [], [], [], []

# Film names: one entry per CSV row, kept in file order (no de-duplication),
# together with their count and a single-column DataFrame.
film_list = [*df_film]
film_count = len(film_list)
df_film_name = pd.DataFrame(film_list, columns=['filmname'])

def _unique_names(series):
    """Return the distinct '/'-separated names found in *series*.

    Every non-missing cell is split on '/' and the pieces are de-duplicated
    while keeping the order in which they first appear.  Missing cells (NaN)
    are skipped instead of raising AttributeError on ``.split``.

    First-seen order is used rather than ``list(set(...))`` because set
    iteration order for strings changes between interpreter runs (hash
    randomization); the IDs below are assigned by list position, so a stable
    order keeps the generated node files reproducible.
    """
    pieces = (name for cell in series.dropna() for name in cell.split('/'))
    return list(dict.fromkeys(pieces))


# Distinct directors: list, count and single-column DataFrame.
director_list = _unique_names(df_directors)
director_count = len(director_list)
df_director_name = pd.DataFrame(data=director_list, columns=['directorname'])

# Distinct actors: list, count and single-column DataFrame.
actor_list = _unique_names(df_actors)
actor_count = len(actor_list)
df_actor_name = pd.DataFrame(data=actor_list, columns=['actorname'])

# Distinct types: list, count and single-column DataFrame.
type_list = _unique_names(df_types)
type_count = len(type_list)
df_type_name = pd.DataFrame(data=type_list, columns=['typename'])


# # Generate consecutive integer IDs for films, directors, actors and types.
# Each kind of node starts counting from its own base value.

# Film IDs: 1000001, 1000002, ...
filmID.extend(range(1000001, 1000001 + film_count))
df_film_id = pd.DataFrame(filmID, columns=['filmid'])

# Director IDs: 2000001, 2000002, ...
directorID.extend(range(2000001, 2000001 + director_count))
df_director_id = pd.DataFrame(directorID, columns=['directorid'])

# Actor IDs: 3000001, 3000002, ...
actorID.extend(range(3000001, 3000001 + actor_count))
df_actor_id = pd.DataFrame(actorID, columns=['actorid'])

# Type IDs: 4000001, 4000002, ...
typeID.extend(range(4000001, 4000001 + type_count))
df_type_id = pd.DataFrame(typeID, columns=['typeid'])

# # Assemble the node tables.
# Each table places the ID column next to the name column and then appends a
# constant 'label' column naming the kind of node.

# Film table
film_table = pd.concat([df_film_id, df_film_name], axis=1).assign(label='电影')

# Director table
director_table = pd.concat(
    [df_director_id, df_director_name], axis=1
).assign(label='导演')

# Actor table
actor_table = pd.concat([df_actor_id, df_actor_name], axis=1).assign(label='演员')

# Type table
type_table = pd.concat([df_type_id, df_type_name], axis=1).assign(label='类型')

# # Write the node files.
# Each table's columns are renamed to the ID / property / label headers before
# it is written as UTF-8 CSV without the DataFrame index.

# to_csv does not create missing parent directories, so make sure 'out/'
# exists; otherwise the first export fails on a fresh checkout.
os.makedirs('out', exist_ok=True)

# Film node file
film_table.columns = ['index:ID', 'film', ':LABEL']
film_table.to_csv('out/film.csv', index=False, encoding='utf-8')
print('电影结点导出到CSV成功')

# Director node file
director_table.columns = ['index:ID', 'director', ':LABEL']
director_table.to_csv('out/director.csv', index=False, encoding='utf-8')
print('导演结点导出到CSV成功')

# Actor node file
actor_table.columns = ['index:ID', 'actor', ':LABEL']
actor_table.to_csv('out/actor.csv', index=False, encoding='utf-8')
print('演员结点导出到CSV成功')

# Type node file
type_table.columns = ['index:ID', 'type', ':LABEL']
type_table.to_csv('out/type.csv', index=False, encoding='utf-8')
print('类型结点导出到CSV成功')


